suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(datasauRus))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggthemes))
suppressPackageStartupMessages(library(ggpubr))
suppressPackageStartupMessages(library(leaflet))
suppressPackageStartupMessages(library(leaflet.extras2))Assignment1
Settings
idir <- "/Users/xinranwang/Documents/Course/25Fall/PM566/Data"EDA
(30 points) Given the formulated question from the assignment description, you will now conduct EDA Checklist items 1-5. First, download 2002 and 2022 data for all sites in California from the EPA Air Quality Data website, then read the data into R. For each of the two datasets, check the dimensions, headers, footers, variable names and variable types. Check the distribution of the key variable we are analyzing (PM). Write up a summary of all of your findings.
Summary. This data frame includes measurements at the state level (California), county level (51 counties), and city level (199 monitoring sites). PM2.5 was recorded using 21 different methods (see Method.Description for details) and reported in units of µg/m3 LC.
read_and_check <- function(idir, filename, year){ df <- read.csv(paste0(idir, "/", filename)) %>% mutate(Year = year) print(paste0("Dimension: ", paste(dim(df), collapse = " x "), "; # Rows: ", nrow(df), "; # Columns: ", ncol(df))) print(paste0("Variables names: ", paste(colnames(df), collapse = ", "))) print("Variable structures: ") str(df) return(df) }2002
df_2002 <- read_and_check(idir, "ad_viz_plotval_data_PM2.5_2002.csv", "2002")[1] "Dimension: 15976 x 23; # Rows: 15976; # Columns: 23" [1] "Variables names: Date, Source, Site.ID, POC, Daily.Mean.PM2.5.Concentration, Units, Daily.AQI.Value, Local.Site.Name, Daily.Obs.Count, Percent.Complete, AQS.Parameter.Code, AQS.Parameter.Description, Method.Code, Method.Description, CBSA.Code, CBSA.Name, State.FIPS.Code, State, County.FIPS.Code, County, Site.Latitude, Site.Longitude, Year" [1] "Variable structures: " 'data.frame': 15976 obs. of 23 variables: $ Date : chr "01/05/2002" "01/06/2002" "01/08/2002" "01/11/2002" ... $ Source : chr "AQS" "AQS" "AQS" "AQS" ... $ Site.ID : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ... $ POC : int 1 1 1 1 1 1 1 1 1 1 ... $ Daily.Mean.PM2.5.Concentration: num 25.1 31.6 21.4 25.9 34.5 41 29.3 15 18.8 37.9 ... $ Units : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ... $ Daily.AQI.Value : int 81 93 74 82 98 115 89 62 69 107 ... $ Local.Site.Name : chr "Livermore" "Livermore" "Livermore" "Livermore" ... $ Daily.Obs.Count : int 1 1 1 1 1 1 1 1 1 1 ... $ Percent.Complete : num 100 100 100 100 100 100 100 100 100 100 ... $ AQS.Parameter.Code : int 88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ... $ AQS.Parameter.Description : chr "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ... $ Method.Code : int 120 120 120 120 120 120 120 120 120 120 ... $ Method.Description : chr "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" ... $ CBSA.Code : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ... $ CBSA.Name : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ... $ State.FIPS.Code : int 6 6 6 6 6 6 6 6 6 6 ... $ State : chr "California" "California" "California" "California" ... $ County.FIPS.Code : int 1 1 1 1 1 1 1 1 1 1 ... $ County : chr "Alameda" "Alameda" "Alameda" "Alameda" ... $ Site.Latitude : num 37.7 37.7 37.7 37.7 37.7 ... $ Site.Longitude : num -122 -122 -122 -122 -122 ... $ Year : chr "2002" "2002" "2002" "2002" ...kable(head(df_2002, 1), align = "c")Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete AQS.Parameter.Code AQS.Parameter.Description Method.Code Method.Description CBSA.Code CBSA.Name State.FIPS.Code State County.FIPS.Code County Site.Latitude Site.Longitude Year 01/05/2002 AQS 60010007 1 25.1 ug/m3 LC 81 Livermore 1 100 88101 PM2.5 - Local Conditions 120 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS 41860 San Francisco-Oakland-Hayward, CA 6 California 1 Alameda 37.68753 -121.7842 2002 kable(tail(df_2002, 1), align = "c")Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete AQS.Parameter.Code AQS.Parameter.Description Method.Code Method.Description CBSA.Code CBSA.Name State.FIPS.Code State County.FIPS.Code County Site.Latitude Site.Longitude Year 15976 12/31/2002 AQS 61131003 1 6 ug/m3 LC 33 Woodland-Gibson Road 1 100 88101 PM2.5 - Local Conditions 117 R & P Model 2000 PM2.5 Sampler w/WINS 40900 Sacramento–Roseville–Arden-Arcade, CA 6 California 113 Yolo 38.66121 -121.7327 2002 2022
df_2022 <- read_and_check(idir, "ad_viz_plotval_data_PM2.5_2022.csv", "2022")[1] "Dimension: 59918 x 23; # Rows: 59918; # Columns: 23" [1] "Variables names: Date, Source, Site.ID, POC, Daily.Mean.PM2.5.Concentration, Units, Daily.AQI.Value, Local.Site.Name, Daily.Obs.Count, Percent.Complete, AQS.Parameter.Code, AQS.Parameter.Description, Method.Code, Method.Description, CBSA.Code, CBSA.Name, State.FIPS.Code, State, County.FIPS.Code, County, Site.Latitude, Site.Longitude, Year" [1] "Variable structures: " 'data.frame': 59918 obs. of 23 variables: $ Date : chr "01/01/2022" "01/02/2022" "01/03/2022" "01/04/2022" ... $ Source : chr "AQS" "AQS" "AQS" "AQS" ... $ Site.ID : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ... $ POC : int 3 3 3 3 3 3 3 3 3 3 ... $ Daily.Mean.PM2.5.Concentration: num 12.7 13.9 7.1 3.7 4.2 3.8 2.3 6.9 13.6 11.2 ... $ Units : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ... $ Daily.AQI.Value : int 58 60 39 21 23 21 13 38 59 55 ... $ Local.Site.Name : chr "Livermore" "Livermore" "Livermore" "Livermore" ... $ Daily.Obs.Count : int 1 1 1 1 1 1 1 1 1 1 ... $ Percent.Complete : num 100 100 100 100 100 100 100 100 100 100 ... $ AQS.Parameter.Code : int 88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ... $ AQS.Parameter.Description : chr "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ... $ Method.Code : int 170 170 170 170 170 170 170 170 170 170 ... $ Method.Description : chr "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" ... $ CBSA.Code : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ... $ CBSA.Name : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ... $ State.FIPS.Code : int 6 6 6 6 6 6 6 6 6 6 ... $ State : chr "California" "California" "California" "California" ... $ County.FIPS.Code : int 1 1 1 1 1 1 1 1 1 1 ... $ County : chr "Alameda" "Alameda" "Alameda" "Alameda" ... $ Site.Latitude : num 37.7 37.7 37.7 37.7 37.7 ... $ Site.Longitude : num -122 -122 -122 -122 -122 ... $ Year : chr "2022" "2022" "2022" "2022" ...kable(head(df_2022, 1), align = "c")Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete AQS.Parameter.Code AQS.Parameter.Description Method.Code Method.Description CBSA.Code CBSA.Name State.FIPS.Code State County.FIPS.Code County Site.Latitude Site.Longitude Year 01/01/2022 AQS 60010007 3 12.7 ug/m3 LC 58 Livermore 1 100 88101 PM2.5 - Local Conditions 170 Met One BAM-1020 Mass Monitor w/VSCC 41860 San Francisco-Oakland-Hayward, CA 6 California 1 Alameda 37.68753 -121.7842 2022 kable(tail(df_2022, 1), align = "c")Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete AQS.Parameter.Code AQS.Parameter.Description Method.Code Method.Description CBSA.Code CBSA.Name State.FIPS.Code State County.FIPS.Code County Site.Latitude Site.Longitude Year 59918 12/31/2022 AQS 61131003 1 1 ug/m3 LC 6 Woodland-Gibson Road 1 100 88101 PM2.5 - Local Conditions 145 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC 40900 Sacramento–Roseville–Arden-Arcade, CA 6 California 113 Yolo 38.66121 -121.7327 2022 identical(colnames(df_2002), colnames(df_2022))[1] TRUENote. Here, I combined data frames as they have identical column names.
df <- rbind(df_2002, df_2022) rm(df_2002, df_2022)Distribution of PM 2.5
Summary. There is a striking decrease in daily PM2.5 concentrations from 2002 to 2022. Mean and median PM 2.5 were cut in half. However, there are more outliers in 2022 and max PM 2.5 increased by 3 fold
df <- df %>% rename(PM2.5 = Daily.Mean.PM2.5.Concentration) df$Year <- factor(df$Year, levels = c("2002", "2022")) summary1 <- df %>% group_by(Year) %>% summarise( mean = round(mean(PM2.5, na.rm = T), 2), sd = round(sd(PM2.5, na.rm = TRUE), 2), min = min(PM2.5, na.rm = TRUE), q25 = quantile(PM2.5, 0.25, na.rm = TRUE), median = median(PM2.5, na.rm = TRUE), q75 = quantile(PM2.5, 0.75, na.rm = TRUE), max = max(PM2.5, na.rm = TRUE) ) kable(as.data.frame(summary1), align = "c")Year mean sd min q25 median q75 max 2002 16.12 13.87 0.0 7.0 12.0 20.5 104.3 2022 8.41 7.64 -6.7 4.1 6.8 10.7 302.5 (10 points) Combine the two years of data into one data frame. Use the
Datevariable to create a new column for year, which will serve as an identifier. Change the names of the key variables so that they are easier to refer to in your code.Note. (1) I’ve created new column for year when reading data, (2) I’ve renamed Daily.Mean.PM2.5.Concentration to PM2.5 in previous step (3) I’ve combined data in previous step (before plotting).
(20 points) Create a basic map (or maps) in
leafletshowing the locations of the monitoring sites, using different colors for each year. Summarize the spatial distribution of the sites. How does this distribution change from 2002 to 2022?Map (2002 vs 2022)
Summary. By 2022, more sites had been established, frequently clustering near the 2002 sites (see map above). The number of sites increased 3.75-fold from 2002 to 2022 (rising from 15,978 to 59,918 sites; see histrogram and table in Question 4).
# Map #scales::hue_pal()(2) pal <- leaflet::colorFactor(c("#F8766D", "#00BFC4"), domain = levels(df$Year)) leaflet(df) %>% addMapPane("left", zIndex = 410) %>% addMapPane("right", zIndex = 410) %>% addProviderTiles('OpenStreetMap', options = pathOptions(pane = "left"), layerId = "layer_2002") %>% addProviderTiles('OpenStreetMap', options = pathOptions(pane = "right"), layerId = "layer_2022") %>% addCircles(data = df %>% filter(Year == 2002), lat = ~Site.Latitude, lng = ~Site.Longitude, fillColor = "#F8766D", color = "black", stroke = TRUE, weight = 0.2, opacity = 0.5, fillOpacity = 1, radius = 100, options = pathOptions(pane = "left"), group = "2002") %>% addCircles(data = df %>% filter(Year == 2022), lat = ~Site.Latitude, lng = ~Site.Longitude, fillColor = "#00BFC4", color = "black", stroke = TRUE, weight = 0.2, opacity = 0.5, fillOpacity = 1, radius = 100, options = pathOptions(pane = "right"), group = "2022") %>% addLegend("topright", title = "Location of Monitor Sites", pal = pal, values = ~Year, labels = ~Year ) %>% addLayersControl(overlayGroups = ~c("2002", "2022"), options = layersControlOptions(collapsed = F)) %>% addSidebyside( layerId = "sidecontrols", leftId = "layer_2002", rightId = "layer_2022")#https://rdrr.io/cran/leaflet.extras2/man/addSidebyside.html(10 points) Check for any data issues such as missing or implausible values of PM in the combined dataset. Calculate the proportion of missing/implausible values for each year and report any temporal patterns you see in these observations.
Missing values. There are no missing values in either year.
Implausible values. In 2022, 215 out of 59,918 observations (0.36%) were implausible (defined as PM2.5 < 0).
Temporal Patterns. The number of implausible observations increased in 2022, although the count remained very small overall).
summary2 <- df %>% group_by(Year) %>% summarise( `# Total` = n(), `# Missing` = sum(is.na(PM2.5)), `# Implausible` = sum(PM2.5 < 0, na.rm = TRUE) ) %>% mutate( `% Missing` = paste0(round((`# Missing` / `# Total`) * 100, 2), "%"), `% Implausible` = paste0(round((`# Implausible` / `# Total`) * 100, 2), "%")) %>% select(`# Total`, `# Missing`, `% Missing`, `# Implausible`, `% Implausible`) kable(as.data.frame(summary2), align = "c")# Total # Missing % Missing # Implausible % Implausible 15976 0 0% 0 0% 59918 0 0% 215 0.36% # Count of PM 2.5 Concentration df$Year <- factor(df$Year, levels = c("2022", "2002")) # 2002 will be drawn last (on top) ggplot(df, aes(x = PM2.5, fill = Year, color = Year)) + geom_histogram(bins = 100, alpha = 0.5, position = "identity") + labs(title = "Histrogram of Daily PM 2.5 Concentration", x = "PM2.5 Concentration", y = "Count") + scale_color_manual(name = "Year", values = c("2002" = "#F8766D", "2022" = "#00BFC4")) + scale_fill_manual(name = "Year", values = c("2002" = "#F8766D", "2022" = "#00BFC4")) + theme_classic()df$Year <- factor(df$Year, levels = c("2002", "2022")) # 2002 will be drawn last (on top)(30 points) Explore the main question of interest at three different levels of spatial resolution. Create exploratory plots (e.g. boxplots, histograms, line plots, violin plots) and summary statistics that best suit each level of data. Be sure to write up explanations of what you observe in these data.
Level 1: State. Examine the primary question for the entire state.
Level 2: County. Examine the primary question for every county in California.
Level 3: City. Restrict the data to sites in Los Angeles county and examine the primary question for every site.
Level 1: State
Summary. The mean of daily PM 2.5 in 2022 (mean: 8.41; sd: 7.64) is lower than than 2002 (mean: 16.12; sd: 13.87), despite more outliers.
summary3 <- df %>% group_by(Year) %>% summarise( n = n(), mean = round(mean(PM2.5, na.rm = T), 2), sd = round(sd(PM2.5, na.rm = T), 2)) %>% mutate(`Mean (SD)`= paste0(mean, " (", sd, ")")) %>% select(Year, `Mean (SD)`) %>% arrange(Year) kable(as.data.frame(summary3), align = "c")Year Mean (SD) 2002 16.12 (13.87) 2022 8.41 (7.64) ggplot(df, aes_string(x = "Year", y = "PM2.5", fill = "Year", color = "Year")) + geom_boxplot(color = "black", size = 0.2) + labs(title = paste0(" Daily PM2.5 Concentration (2002 vs 2022) in California"), x = "Year", y = paste0("Daily PM2.5 Concentration")) + scale_color_manual(name = "Year", values = c("2002" = "#F8766D", "2022" = "#00BFC4")) + theme_minimal() + theme(legend.position = "bottom", legend.box = "horizontal", legend.justification = "center")Warning: `aes_string()` was deprecated in ggplot2 3.0.0. ℹ Please use tidy evaluation idioms with `aes()`. ℹ See also `vignette("ggplot2-in-packages")` for more information.Warning: No shared levels found between `names(values)` of the manual scale and the data's colour values.Level 2: County
Summary. The mean of daily PM 2.5 for most counties in California in 2022 are lower than that of 2000, except for Trinity, Siskiyou, Mono, and Del Norte.
summary4 <- df %>% group_by(County, Year) %>% summarise( n = n(), mean = round(mean(PM2.5, na.rm = T), 2), sd = round(sd(PM2.5, na.rm = TRUE), 2)) %>% mutate(`Mean (SD)`= paste0(mean, " (", sd, ")")) %>% select(Year, County, `Mean (SD)`) %>% pivot_wider( names_from = Year, values_from = `Mean (SD)`, names_prefix = "Mean (SD) in ") %>% select(County, `Mean (SD) in 2002`, `Mean (SD) in 2022`)`summarise()` has grouped output by 'County'. You can override using the `.groups` argument.kable(as.data.frame(summary4), align = "c")County Mean (SD) in 2002 Mean (SD) in 2022 Alameda 14.25 (11.43) 8.2 (4.95) Butte 14.76 (11.66) 6.19 (5.79) Calaveras 9.9 (6.5) 6.04 (4.1) Colusa 11.73 (10.01) 7.61 (4.76) Contra Costa 15.09 (14.47) 8.24 (4.93) Del Norte 3.82 (2.56) 4.97 (3.58) El Dorado 4.91 (4.07) 4.07 (9.18) Fresno 19.93 (18.91) 10.19 (8.3) Glenn NA 5.34 (4.98) Humboldt 7.79 (4.59) 6.76 (4.35) Imperial 12.71 (7.7) 9.55 (6.39) Inyo 7.14 (10.49) 5.55 (5.47) Kern 23 (18.72) 9.88 (9.91) Kings 24.7 (19.06) 14.4 (10.78) Lake 6.5 (10.88) 4.26 (3.12) Los Angeles 19.66 (11.88) 10.97 (5.24) Madera NA 10.39 (7.26) Marin 7.31 (4.87) 6.54 (4.17) Mariposa 9.48 (5.97) 9.47 (17.78) Mendocino 8.85 (8.79) 10.14 (5.19) Merced 21.54 (15.32) 10.06 (7.32) Modoc 5 (0) NA Mono 2.68 (2.57) 4.43 (6.09) Monterey 8.97 (4.53) 4.75 (3.21) Nevada 7.36 (3.41) 7.59 (15.86) Orange 17.83 (10.55) 9.24 (4.43) Placer 13.1 (10.24) 7.28 (15.48) Plumas 12.86 (8.94) 11.23 (10.33) Riverside 22.37 (16.1) 9.13 (6.27) Sacramento 15.17 (14.51) 8.25 (7.54) San Benito 4.99 (2.92) 4.71 (2.69) San Bernardino 16.3 (13.08) 9.42 (5.19) San Diego 15.27 (7.65) 8.3 (4.36) San Francisco 15.31 (14.21) 6.78 (5.14) San Joaquin 16.76 (12.26) 7.9 (7.81) San Luis Obispo 8.88 (5.15) 7.1 (4.52) San Mateo 12.24 (8.91) 6.8 (4.94) Santa Barbara 7.04 (3.92) 6.06 (3.51) Santa Clara 14.83 (10.65) 8.77 (5.59) Santa Cruz 8.55 (4.34) 5.23 (3.74) Shasta 7.25 (8.38) 5.04 (4.67) Siskiyou 2.69 (2.06) 7.6 (17.3) Solano 15.48 (14.15) 7.9 (5.46) Sonoma 11.13 (9.5) 6.38 (4.29) Stanislaus 19.66 (17.09) 12.12 (8.42) Sutter 12.82 (10.2) 10.04 (6.6) Tehama NA 5.87 (4.5) Trinity 2.78 (2.4) 10.73 (23.1) Tulare 23.1 (16.15) 10.41 (7.53) Ventura 13.4 (7.24) 6.85 (3.83) Yolo 10.58 (8.86) 5.44 (5.29) df_couty <- df %>% group_by(Year, County) %>% summarise( n = n(), mean_PM2.5 = mean(PM2.5, na.rm = T), sd_PM2.5 = sd(PM2.5, na.rm = T), ) %>% mutate(se_PM2.5 = sd_PM2.5 / sqrt(n))`summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.ggplot(df_couty, aes(y = County, x = mean_PM2.5, xmin = mean_PM2.5 - 1.96 * se_PM2.5, xmax = mean_PM2.5 + 1.96 * se_PM2.5, color = Year, group = Year)) + geom_pointrange(size = 0.3, fatten = 1.2) + labs(title = paste0("Daily PM2.5 Concentration (2002 vs 2022), by County"), x = "Daily PM2.5 Concentration (95% CI)", y = "County",) + scale_color_manual(name = "Year", values = c("2002" = "#F8766D", "2022" = "#00BFC4")) + theme_minimal() + theme(legend.position = "bottom", legend.box = "horizontal", legend.justification = "center")Level 3: Sites in LA County
Summary. The mean of daily PM 2.5 for all cities in Los Angeles county in 2022 are lower than that of 2000
summary5 <- df %>%
filter(County == "Los Angeles",
Year %in% c("2002", "2022"),
Local.Site.Name != "") %>%
group_by(Year, Local.Site.Name) %>%
summarise(
n = n(),
mean = round(mean(PM2.5, na.rm = T), 2),
sd = round(sd(PM2.5, na.rm = TRUE), 2)) %>%
mutate(`Mean (SD)`= paste0(mean, " (", sd, ")")) %>%
select(Year, Local.Site.Name, `Mean (SD)`) %>%
pivot_wider(
names_from = Year,
values_from = `Mean (SD)`,
names_prefix = "Mean (SD) in ") %>%
mutate(`Local Site Name` = Local.Site.Name) %>%
select(`Local Site Name`, `Mean (SD) in 2002`, `Mean (SD) in 2022`) %>%
arrange(`Local Site Name`) `summarise()` has grouped output by 'Year'. You can override using the
`.groups` argument.
kable(as.data.frame(summary5), align = "c")| Local Site Name | Mean (SD) in 2002 | Mean (SD) in 2022 |
|---|---|---|
| Azusa | 20.76 (12.11) | 9.72 (4.39) |
| Burbank | 23.97 (12.74) | NA |
| Compton | NA | 12.99 (6.22) |
| Glendora | NA | 8.42 (5.47) |
| Lancaster-Division Street | 10.38 (4.47) | 7.52 (2.48) |
| Lebec | 4.82 (2.78) | 3.5 (1.65) |
| Long Beach (North) | 19.47 (10.39) | 9.92 (3.27) |
| Long Beach (South) | NA | 11.97 (4.32) |
| Long Beach-Route 710 Near Road | NA | 13.02 (5.61) |
| Los Angeles-North Main Street | 21.97 (11.69) | 11.58 (4.57) |
| Lynwood | 23.35 (11.96) | NA |
| North Hollywood (NOHO) | NA | 13.02 (4.75) |
| Pasadena | 20.29 (11.14) | 9.09 (3.68) |
| Pico Rivera #2 | NA | 11.45 (5.93) |
| Reseda | 18.85 (10.65) | 10.72 (4.56) |
| Santa Clarita | NA | 9.14 (3.9) |
| Signal Hill (LBSH) | NA | 8.85 (4.42) |
ggplot(df %>%
filter(County == "Los Angeles",
Year %in% c("2002", "2022"),
Local.Site.Name != ""),
aes(x = PM2.5,
y = Local.Site.Name,
fill = Year)) +
geom_violin(alpha = 0.6, scale = "width",
position = position_dodge(width = 0.8), trim = FALSE) +
labs(
title = "Daily PM2.5 Concentration in LA County",
subtitle = "By monitoring sites (2002 vs 2022)",
x = "Daily PM2.5 Concentration",
y = "Site",
fill = "Year") +
scale_color_manual(name = "Year",
values = c("2002" = "#F8766D", "2022" = "#00BFC4")) +
scale_fill_manual(name = "Year",
values = c("2002" = "#F8766D", "2022" = "#00BFC4")) +
theme_minimal() +
theme(
legend.position = "bottom",
legend.box = "horizontal",
legend.justification = "center"
)Warning: No shared levels found between `names(values)` of the manual scale and the
data's colour values.
Reminder: after you upload your final rendered document to GitHub, you should download it to make sure that it looks right! If you haven’t included embed-resources: true in the YAML header, none of your figures will show up!